{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2ed3b36c-83f6-4223-aaa6-223dd11652a0",
   "metadata": {},
   "outputs": [],
   "source": [
    "import requests\n",
    "from lxml import etree\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "from lxml import html\n",
    "import string\n",
    "import requests_file\n",
    "import time\n",
    "import glob, os\n",
    "import re\n",
    "from datetime import datetime\n",
    "import math"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c923b513-e9c7-4d60-bd5f-523e4c1c2b81",
   "metadata": {},
   "outputs": [],
   "source": [
    "def dc_roa_parser(file):\n",
    "    '''\n",
    "    This function aims to clean the 67th District Court ROAs.\n",
    "    '''\n",
    "    records = {}\n",
    "    try:\n",
    "        with open(file, \"r\") as f:\n",
    "            page = f.read()\n",
    "        html = etree.HTML(page)\n",
    "        #case level info\n",
    "        records[\"case_id\"] = html.xpath('//div[contains(@class,\"case-id\")]//text()')[0]\n",
    "        records[\"judge_name\"] = html.xpath('//div[contains(@class,\"judge-name\")]//text()')[0]\n",
    "        records[\"entitlement\"] = html.xpath('//div[contains(@class,\"entitlement\")]//text()')[0]\n",
    "        try:\n",
    "            records[\"pin\"] = html.xpath('//div[contains(@class,\"police-incidence-number\")]//text()')[0]\n",
    "        except:\n",
    "            records[\"pin\"] = \"\"\n",
    "        records[\"date_filed\"] = html.xpath('//div[contains(@class,\"filed-date\")]//text()')[0]\n",
    "        try:\n",
    "            records[\"date_closed\"] = html.xpath('//div[contains(@class,\"closed-date\")]//text()')[0]\n",
    "        except:\n",
    "            records[\"date_closed\"] = \"\"\n",
    "        records[\"case_status\"] = html.xpath('//div[contains(@class,\"case-status\")]//text()')[0]\n",
    "        try:\n",
    "            records[\"balance\"] = html.xpath('//div[contains(@class,\"balance-amount\")]//text()')[0]\n",
    "        except:\n",
    "            records[\"balance\"] = \"\"\n",
    "        records[\"party_name\"] = html.xpath('//div[@class=\"col-auto party-name ml-2\"]//text()')[0]\n",
    "        records[\"party_type\"] = html.xpath('//div[@class=\"col-auto party-type-number ml-2\"]//text()')[0]\n",
    "        records[\"age\"] = html.xpath('//div[@class=\"age ml-2\"]//text()')[0]\n",
    "        records[\"attorney_name\"] = html.xpath('//div[@class=\"attorney-name ml-2\"]//text()')[0]\n",
    "        try:\n",
    "            records[\"bond_amount\"] = html.xpath('//div[contains(@class,\"bond-amount\")]//text()')[0]\n",
    "        except:\n",
    "            records[\"bond_amount\"] = \"\"\n",
    "        #charges\n",
    "        charge_grp = html.xpath('//section[@aria-label=\"Charges\"]//div[@class=\"card\"]')\n",
    "        c=1\n",
    "        for charge in charge_grp:\n",
    "            records[\"c_current_charge{0}\".format(c)] = charge.xpath('.//div[@class=\"col-auto current-charge ml-2\"]//text()')[0]\n",
    "            records[\"c_original_charge{0}\".format(c)] = charge.xpath('.//div[@class=\"col-auto original-charge ml-2\"]//text()')[0]\n",
    "            records[\"c_officer{0}\".format(c)] = charge.xpath('.//div[@class=\"officer-agency-petitioner ml-2\"]//text()')[0]\n",
    "            records[\"c_charge_level{0}\".format(c)] = charge.xpath('.//div[@class=\"charge-level ml-2\"]//text()')[0]\n",
    "            try:\n",
    "                records[\"c_amended_reduced{0}\".format(c)] = charge.xpath('.//div[@class=\"amended-or-reduced ml-2\"]//text()')[0]\n",
    "            except:\n",
    "                records[\"c_amended_reduced{0}\".format(c)] = \"\"\n",
    "            try:\n",
    "                records[\"c_arraignment_date{0}\".format(c)] = charge.xpath('.//div[@class=\"arraignment-date ml-2\"]//text()')[0]\n",
    "            except:\n",
    "                records[\"c_arraignment_date{0}\".format(c)] = \"\"\n",
    "            try:\n",
    "                records[\"c_disposition_date{0}\".format(c)] = charge.xpath('.//div[@class=\"disposition-date ml-2\"]//text()')[0]\n",
    "            except:\n",
    "                records[\"c_disposition_date{0}\".format(c)] = \"\"\n",
    "            try:\n",
    "                records[\"c_sentencing_date{0}\".format(c)] = charge.xpath('.//div[@class=\"sentencing-date ml-2\"]//text()')[0]\n",
    "            except:\n",
    "                records[\"c_sentencing_date{0}\".format(c)] = \"\"\n",
    "            try:\n",
    "                records[\"c_attempted_conspired_solicited{0}\".format(c)] = charge.xpath('.//div[@class=\"attempted-conspired-solicited ml-2\"]//text()')[0]\n",
    "            except:\n",
    "                records[\"c_attempted_conspired_solicited{0}\".format(c)] = \"\"\n",
    "            try:\n",
    "                records[\"c_notice{0}\".format(c)] = charge.xpath('.//div[@class=\"notice ml-2\"]//text()')[0]\n",
    "            except:\n",
    "                records[\"c_notice{0}\".format(c)] = \"\"\n",
    "            try:\n",
    "                records[\"c_disposition{0}\".format(c)] = charge.xpath('.//div[@class=\"disposition ml-2\"]//text()')[0]\n",
    "            except:\n",
    "                records[\"c_disposition{0}\".format(c)] = \"\"\n",
    "            c+=1\n",
    "\n",
    "\n",
    "        #sentencing\n",
    "        try:\n",
    "            records[\"plea_hearing_held\"] = charge.xpath('//div[@class=\"col-auto mb-1 sentencing-event\"]//div[@class=\"col-auto ml-2\"]//text()')[0]\n",
    "        except:\n",
    "            records[\"plea_hearing_held\"] = \"\"\n",
    "        try:\n",
    "            records[\"incarceration_type\"] = charge.xpath('//div[@class=\"col-auto sentencing-incarceration-type ml-2\"]//text()')[0]\n",
    "        except:\n",
    "            records[\"incarceration_type\"] = \"\"\n",
    "        try:\n",
    "            records[\"minimum_term\"] = charge.xpath('//div[@class=\"ml-2 sentencing-min-term\"]//text()')[0]\n",
    "        except:\n",
    "            records[\"minimum_term\"] = \"\"\n",
    "\n",
    "        #hearings\n",
    "        hearing_grp = html.xpath('//div[@aria-label=\"Hearing\"]//div[@class=\"card\"]')\n",
    "        if len(hearing_grp)>0:\n",
    "            h = 1\n",
    "            for hearing in hearing_grp:\n",
    "                records[\"hearing_type{0}\".format(h)] = hearing.xpath('.//div[@class=\"hearing-type ml-2\"]//text()')[0]\n",
    "                records[\"hearing_date{0}\".format(h)] = hearing.xpath('.//div[@class=\"hearing-date-time ml-2\"]//text()')[0]\n",
    "                records[\"hearing_officer{0}\".format(h)] = hearing.xpath('.//div[@class=\"hearing-officer ml-2\"]//text()')[0]\n",
    "                h+=1\n",
    "\n",
    "        #events\n",
    "        event_grps = html.xpath('//div[@aria-label=\"Event\"]//div[@class=\"card\"]')\n",
    "        e=1\n",
    "        for grp in event_grps:\n",
    "            grp_date = grp.xpath('.//div[@class=\"col-auto event-date ml-2\"]//text()')[0]\n",
    "            grp_records = []\n",
    "            events = grp.xpath('.//div[@class=\"m-1 pl-2 ng-star-inserted\"]')\n",
    "            for event in events:\n",
    "                records[\"e_description{0}\".format(e)] = event.xpath('.//div[@class=\"col-auto description ml-2\"]//text()')[0]\n",
    "                try:\n",
    "                    records[\"e_comment{0}\".format(e)] = event.xpath('.//div[@class=\"comment ml-2\"]//text()')[0]\n",
    "                except:\n",
    "                    records[\"e_comment{0}\".format(e)] = \"\"\n",
    "                records[\"e_party_count{0}\".format(e)] = event.xpath('.//div[@class=\"col-auto party-and-role-and-count ml-2\"]//text()')[0]\n",
    "                records[\"e_clerk{0}\".format(e)] = event.xpath('.//div[@class=\"col-auto event-number-and-clerk ml-2\"]//text()')[0]\n",
    "                try:\n",
    "                    records[\"e_receipt_no_date_judge{0}\".format(e)] = event.xpath('.//div[@class=\"receipt-number-date-judge ml-2\"]//text()')[0]\n",
    "                except:\n",
    "                    records[\"e_receipt_no_date_judge{0}\".format(e)] = \"\"\n",
    "                try:\n",
    "                    records[\"e_attorney{0}\".format(e)] = event.xpath('.//div[@class=\"attorney ml-2\"]//text()')[0]\n",
    "                except:\n",
    "                    records[\"e_attorney{0}\".format(e)] = \"\"\n",
    "                records[\"event_date{0}\".format(e)] = grp_date\n",
    "                e+=1\n",
    "        #print(records)\n",
    "        #records = pd.DataFrame([records], columns=records.keys())\n",
    "    except:\n",
    "        print(file)\n",
    "    records = pd.DataFrame([records], columns=records.keys())\n",
    "    return(records)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4e67a4b2-fb15-422c-b3cc-8764dc538f5f",
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "path = \"C:/Users/\"+os.login()+\"/Dropbox/IGNITE/2input_data/D67ROAs\"\n",
    "files=[]\n",
    "for file in os.listdir(path):\n",
    "    if file.endswith('.html'):\n",
    "        files.append(file)\n",
    "\n",
    "data = dc_roa_parser(path+\"/\"+files[0])\n",
    "for f in files[1:]:\n",
    "    data = pd.concat([data,dc_roa_parser(path+\"/\"+f)])\n",
    "\n",
    "data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b76ae289-deb5-4a25-a187-dea3d754f412",
   "metadata": {},
   "outputs": [],
   "source": [
    "data.to_excel(\"C:/Users/xiy333/Dropbox/temp/test.xlsx\", index=False)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
